{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "cd8da54e-fdb8-4359-a296-169b1a415a8a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os, sys\n",
    "import pandas as pd\n",
    "import scipy \n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "%matplotlib inline\n",
    "from IPython.display import clear_output\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "import sklearn.metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8eafc8f8-61b9-4777-b3e1-b15c3ff78498",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>year</th>\n",
       "      <th>leader</th>\n",
       "      <th>party</th>\n",
       "      <th>orientation</th>\n",
       "      <th>manifesto</th>\n",
       "      <th>hate against out-groups</th>\n",
       "      <th>anger</th>\n",
       "      <th>fear and insecurity</th>\n",
       "      <th>indignation</th>\n",
       "      <th>...</th>\n",
       "      <th>hate against out-groups_coder0</th>\n",
       "      <th>hate against out-groups_coder1</th>\n",
       "      <th>anger_coder0</th>\n",
       "      <th>anger_coder1</th>\n",
       "      <th>fear and insecurity_coder0</th>\n",
       "      <th>fear and insecurity_coder1</th>\n",
       "      <th>indignation_coder0</th>\n",
       "      <th>indignation_coder1</th>\n",
       "      <th>joy and pride_coder0</th>\n",
       "      <th>joy and pride_coder1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>2006</td>\n",
       "      <td>Prodi</td>\n",
       "      <td>Ulivo</td>\n",
       "      <td>Left wing</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>2006</td>\n",
       "      <td>Prodi</td>\n",
       "      <td>Ulivo</td>\n",
       "      <td>Left wing</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>2006</td>\n",
       "      <td>Prodi</td>\n",
       "      <td>Ulivo</td>\n",
       "      <td>Left wing</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>2006</td>\n",
       "      <td>Prodi</td>\n",
       "      <td>Ulivo</td>\n",
       "      <td>Left wing</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>2006</td>\n",
       "      <td>Prodi</td>\n",
       "      <td>Ulivo</td>\n",
       "      <td>Left wing</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0  year leader  party orientation  manifesto  \\\n",
       "0           0  2006  Prodi  Ulivo   Left wing          0   \n",
       "1           1  2006  Prodi  Ulivo   Left wing          0   \n",
       "2           2  2006  Prodi  Ulivo   Left wing          0   \n",
       "3           3  2006  Prodi  Ulivo   Left wing          0   \n",
       "4           4  2006  Prodi  Ulivo   Left wing          0   \n",
       "\n",
       "   hate against out-groups  anger  fear and insecurity  indignation  ...  \\\n",
       "0                      0.0    0.0                  0.0          0.0  ...   \n",
       "1                      0.0    0.0                  0.0          0.0  ...   \n",
       "2                      0.0    0.0                  0.0          0.0  ...   \n",
       "3                      0.0    0.0                  0.0          0.0  ...   \n",
       "4                      0.0    0.0                  0.0          0.0  ...   \n",
       "\n",
       "   hate against out-groups_coder0  hate against out-groups_coder1  \\\n",
       "0                             0.0                             0.0   \n",
       "1                             0.0                             0.0   \n",
       "2                             0.0                             0.0   \n",
       "3                             0.0                             0.0   \n",
       "4                             0.0                             0.0   \n",
       "\n",
       "  anger_coder0  anger_coder1  fear and insecurity_coder0  \\\n",
       "0          0.0           0.0                         0.0   \n",
       "1          0.0           0.0                         0.0   \n",
       "2          0.0           0.0                         0.0   \n",
       "3          0.0           0.0                         0.0   \n",
       "4          0.0           0.0                         0.0   \n",
       "\n",
       "   fear and insecurity_coder1  indignation_coder0  indignation_coder1  \\\n",
       "0                         0.0                 0.0                 0.0   \n",
       "1                         0.0                 0.0                 0.0   \n",
       "2                         0.0                 0.0                 0.0   \n",
       "3                         0.0                 0.0                 0.0   \n",
       "4                         0.0                 0.0                 0.0   \n",
       "\n",
       "   joy and pride_coder0  joy and pride_coder1  \n",
       "0                   1.0                   1.0  \n",
       "1                   1.0                   1.0  \n",
       "2                   0.0                   0.0  \n",
       "3                   0.0                   0.0  \n",
       "4                   0.0                   0.0  \n",
       "\n",
       "[5 rows x 24 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Emotion labels; each one is also used below as a DataFrame column name.\n",
    "annotations = [\"hate against out-groups\", \"anger\", \"fear and insecurity\", \"indignation\", \"joy and pride\"]\n",
    "\n",
    "df = pd.read_excel(\"./data/partially_annotated_data.xlsx\")\n",
    "\n",
    "# Drop rows that have no text.\n",
    "df = df[~df[\"text\"].isna()].copy()\n",
    "# `is_na` is True for rows where every emotion column is missing.\n",
    "# Vectorised row-wise reduction instead of a Python-level apply per row.\n",
    "df[\"is_na\"] = df[annotations].isnull().all(axis=1)\n",
    "# Renumber the remaining rows 0..len(df)-1.\n",
    "df.index = range(len(df))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e90351b0-c601-449d-b036-514754dade59",
   "metadata": {},
   "source": [
    "# Intercoder reliability - Table D SI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "1ac9eb9d-c42c-4c35-b45f-4b1ef8dc23f4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hate against out-groups  Cohen's Kappa =  0.9940768434550201\n",
      "anger  Cohen's Kappa =  0.9632659225048136\n",
      "fear and insecurity  Cohen's Kappa =  0.9627725546394535\n",
      "indignation  Cohen's Kappa =  0.973125675052457\n",
      "joy and pride  Cohen's Kappa =  0.9590925130424309\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import cohen_kappa_score\n",
    "\n",
    "\n",
    "def cohen_kappa(df,sent):\n",
    "    \"\"\"Cohen's kappa between the two coders for one emotion label.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : DataFrame holding the columns ``<sent>_coder0`` and ``<sent>_coder1``.\n",
    "    sent : name of the emotion label (prefix of the two coder columns).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The value returned by ``cohen_kappa_score`` for the rows in which BOTH\n",
    "    coder columns are non-missing.\n",
    "    \"\"\"\n",
    "    col_0, col_1 = sent + \"_coder0\", sent + \"_coder1\"\n",
    "    # Use one joint mask: filtering each column by its own NaNs would shift the\n",
    "    # two label vectors against each other (or give them different lengths)\n",
    "    # whenever a row was annotated by only one coder.\n",
    "    both_coded = df[col_0].notna() & df[col_1].notna()\n",
    "    labels_0 = df.loc[both_coded, col_0].astype(int).values\n",
    "    labels_1 = df.loc[both_coded, col_1].astype(int).values\n",
    "\n",
    "    return cohen_kappa_score(labels_0, labels_1)\n",
    "\n",
    "sentiments = [\n",
    "    'hate against out-groups',\n",
    "    'anger',\n",
    "    'fear and insecurity',\n",
    "    'indignation',\n",
    "    'joy and pride',\n",
    "]\n",
    "\n",
    "# Print the inter-coder agreement for every emotion label, one line each.\n",
    "for sent in sentiments:\n",
    "    ck = cohen_kappa(df, sent)\n",
    "    print(sent, \" Cohen's Kappa = \", ck)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f2e4e238-7ddb-4613-9649-511e2e3a835c",
   "metadata": {},
   "source": [
    "# RF Classifier - Table E SI\n",
    "\n",
    "The results of the last cell of this section should be compatible with those shown in Table E of the SI (i.e. the accuracy metrics should be similar or better)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "773bcf6d-b9a2-4c06-817b-cb79dba796d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk, string\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from nltk.corpus import stopwords\n",
    "import nltk.stem.snowball\n",
    "\n",
    "# English Snowball stemmer used when cleaning the sentences.\n",
    "stemmer = nltk.stem.snowball.EnglishStemmer()\n",
    "\n",
    "# Characters that get replaced by a space during cleaning: everything in\n",
    "# string.punctuation plus the typographic apostrophe and double quotes.\n",
    "exclude = set(string.punctuation) | {\"’\", \"“\", \"”\"}\n",
    "\n",
    "# Stop-word lookup built once, instead of calling `stopwords.words()` and\n",
    "# scanning the returned list for every single token.\n",
    "# NOTE(review): `stopwords.words()` is called without a language, exactly as\n",
    "# before; presumably that returns the lists of all languages - confirm this is\n",
    "# intended, given that the stemmer is English.\n",
    "_STOPWORDS = frozenset(stopwords.words())\n",
    "\n",
    "def remove_stuff(s):\n",
    "    \"\"\"Turn a raw sentence into a list of stemmed, lower-cased tokens.\n",
    "\n",
    "    Every character in `exclude` is replaced by a space, the text is split on\n",
    "    whitespace, and tokens that are a single character long or that are stop\n",
    "    words are dropped before stemming.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    s : str, the sentence to clean.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str, the stemmed tokens in their original order.\n",
    "    \"\"\"\n",
    "    s = s.lower()\n",
    "    for c in exclude:\n",
    "        s = s.replace(c, \" \")\n",
    "    # The text was lower-cased above, so tokens need no second .lower() call.\n",
    "    return [stemmer.stem(tok) for tok in s.split() if len(tok) > 1 and tok not in _STOPWORDS]\n",
    "\n",
    "\n",
    "# Keep the rows for which at least one emotion column is filled in.\n",
    "df_annotated = df.loc[~df[\"is_na\"]].copy()\n",
    "\n",
    "# Clean every annotated sentence into its list of stemmed tokens.\n",
    "sentences = df_annotated[\"text\"].values\n",
    "cleaned_sentences = list(map(remove_stuff, sentences))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "b3a6d885-dc4d-48e0-af0c-50379d0d65b7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tot words =  4460\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# Vocabulary: every distinct token that occurs in the cleaned sentences.\n",
    "words_list = set()\n",
    "for s in cleaned_sentences:\n",
    "    words_list.update(s)\n",
    "\n",
    "# Map each token to its feature column, so we know where each word lives.\n",
    "# Tokens are indexed in sorted order: iterating a set of strings directly gives\n",
    "# an order that can change between Python sessions (string hashing is\n",
    "# randomised), which would permute the feature columns from run to run.\n",
    "word_index = {w: i for i, w in enumerate(sorted(words_list))}\n",
    "N = len(word_index)\n",
    "\n",
    "print(\"tot words = \",N)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "0d71a20a-5b02-4193-9580-62bff2ecc101",
   "metadata": {},
   "outputs": [],
   "source": [
    "def map_sentence(s):\n",
    "    \"\"\"Encode a token list as a 0/1 vector of length N.\n",
    "\n",
    "    Position `word_index[w]` is set to 1 for every token `w` of `s` that is a\n",
    "    key of `word_index`; tokens that are not in `word_index` are skipped.\n",
    "    \"\"\"\n",
    "    x = np.zeros(N)\n",
    "    for w in s:\n",
    "        if w in word_index:\n",
    "            x[word_index[w]] = 1\n",
    "    return x\n",
    "\n",
    "# Feature matrix: one row per cleaned sentence.\n",
    "X = np.array(list(map(map_sentence, cleaned_sentences)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "b7ae126f-13e5-4d76-bd75-934223e0a66f",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(322)\n",
    "\n",
    "N_sample = X.shape[0]\n",
    "index = np.random.permutation(np.arange(N_sample))\n",
    "\n",
    "# Cumulative split points on the shuffled rows: the first 80% go to the\n",
    "# training part, rows between 80% and 90% to the \"test\" part and the\n",
    "# remaining rows to the \"valid\" part.\n",
    "p = 0.8\n",
    "p_v = 0.9\n",
    "N_train = int(p*N_sample)\n",
    "N_valid = int(p_v*N_sample)\n",
    "\n",
    "index_train, index_test, index_valid = (\n",
    "    index[lo:hi] for lo, hi in ((None, N_train), (N_train, N_valid), (N_valid, None))\n",
    ")\n",
    "\n",
    "X_train, X_test, X_valid = X[index_train], X[index_test], X[index_valid]\n",
    "\n",
    "# One label vector per emotion, subset with the same row indices as X.\n",
    "y = {annot: df_annotated[annot].values for annot in annotations}\n",
    "y_train, y_test, y_valid = (\n",
    "    {annot: y[annot][rows] for annot in annotations}\n",
    "    for rows in (index_train, index_test, index_valid)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "ad7def29-1078-4179-9faf-d1147d74f8b4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hate against out-groups  AuROC (test)=  0.8609798023205844\n",
      "hate against out-groups  AuROC (validation)=  0.8876745598057073\n",
      "hate against out-groups  accuracy score (test) =  0.953125\n",
      "hate against out-groups  accuracy score (validation) =  0.9322916666666666\n",
      " thresh =  0.165  fpr =  0.6153846153846154  tpr =  0.6153846153846154\n",
      "\n",
      "\n",
      "anger  AuROC (test)=  0.7356400951069954\n",
      "anger  AuROC (validation)=  0.7461084272678477\n",
      "anger  accuracy score (test) =  0.75\n",
      "anger  accuracy score (validation) =  0.7447916666666666\n",
      " thresh =  0.52  fpr =  0.29508196721311475  tpr =  0.29508196721311475\n",
      "\n",
      "\n",
      "fear and insecurity  AuROC (test)=  0.7068965517241379\n",
      "fear and insecurity  AuROC (validation)=  0.7616766467065867\n",
      "fear and insecurity  accuracy score (test) =  0.890625\n",
      "fear and insecurity  accuracy score (validation) =  0.8645833333333334\n",
      " thresh =  0.31  fpr =  0.2777777777777778  tpr =  0.2777777777777778\n",
      "\n",
      "\n",
      "indignation  AuROC (test)=  0.8014611087236785\n",
      "indignation  AuROC (validation)=  0.8431261770244821\n",
      "indignation  accuracy score (test) =  0.8489583333333334\n",
      "indignation  accuracy score (validation) =  0.8802083333333334\n",
      " thresh =  0.12  fpr =  0.5384615384615384  tpr =  0.5384615384615384\n",
      "\n",
      "\n",
      "joy and pride  AuROC (test)=  0.7319266381766382\n",
      "joy and pride  AuROC (validation)=  0.7903151065801668\n",
      "joy and pride  accuracy score (test) =  0.78125\n",
      "joy and pride  accuracy score (validation) =  0.8333333333333334\n",
      " thresh =  0.23  fpr =  0.3611111111111111  tpr =  0.3611111111111111\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "RF_sentimiento = {}\n",
    "thresholds_annot = {}\n",
    "AuROCs = {}\n",
    "AuROCs_valid = {}\n",
    "\n",
    "for annot in annotations:\n",
    "    # One random forest per emotion label, fitted on the training part.\n",
    "    rfc = RandomForestClassifier(n_estimators=200)\n",
    "    rfc.fit(X_train, y_train[annot])\n",
    "    RF_sentimiento[annot] = rfc\n",
    "\n",
    "    # Column 1 of predict_proba; presumably the probability of label 1\n",
    "    # (assumes the labels are binary 0/1 - TODO confirm).\n",
    "    y_prob = rfc.predict_proba(X_test)[:, 1]\n",
    "    y_prob_valid = rfc.predict_proba(X_valid)[:, 1]\n",
    "\n",
    "    AuROCs[annot] = sklearn.metrics.roc_auc_score(y_test[annot], y_prob)\n",
    "    AuROCs_valid[annot] = sklearn.metrics.roc_auc_score(y_valid[annot], y_prob_valid)\n",
    "    print(annot, \" AuROC (test)= \", AuROCs[annot])\n",
    "    print(annot, \" AuROC (validation)= \", AuROCs_valid[annot])\n",
    "\n",
    "    # Choose the decision threshold that maximises Youden's J on the test part.\n",
    "    # J = sensitivity + specificity - 1 = tpr + (1 - fpr) - 1 = tpr - fpr.\n",
    "    fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_test[annot], y_prob)\n",
    "    youdens = tpr - fpr\n",
    "\n",
    "    max_index = youdens.argmax()\n",
    "    max_thresh = thresholds[max_index]\n",
    "    thresholds_annot[annot] = max_thresh\n",
    "\n",
    "    # Accuracy of the thresholded probabilities on both held-out parts.\n",
    "    print(annot, \" accuracy score (test) = \", (y_test[annot] == (y_prob>=max_thresh)).mean())\n",
    "    print(annot, \" accuracy score (validation) = \", (y_valid[annot] == (y_prob_valid>=max_thresh)).mean() )\n",
    "\n",
    "    # False- and true-positive rate at the chosen threshold.\n",
    "    print(\" thresh = \", max_thresh, \" fpr = \", fpr[max_index],  \" tpr = \", tpr[max_index])\n",
    "    print(\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b3a04042-a6ea-47c3-bf6d-aec9c583f7cf",
   "metadata": {},
   "source": [
    "# Table F SI\n",
    "\n",
    "The results of the last cell of this section should be compatible with those shown in Table F of the SI (i.e. the accuracy metrics should be similar)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "4b64359c-bb2a-4dc7-86ee-1637a722b777",
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(322)\n",
    "\n",
    "N_sample = X.shape[0]\n",
    "index = np.random.permutation(np.arange(N_sample))\n",
    "\n",
    "# Cumulative split points on the shuffled rows: the first 70% go to the\n",
    "# training part, rows between 70% and 90% to the \"test\" part and the\n",
    "# remaining rows to the \"valid\" part.\n",
    "p = 0.7\n",
    "p_v = 0.9\n",
    "N_train = int(p*N_sample)\n",
    "N_valid = int(p_v*N_sample)\n",
    "\n",
    "index_train, index_test, index_valid = (\n",
    "    index[lo:hi] for lo, hi in ((None, N_train), (N_train, N_valid), (N_valid, None))\n",
    ")\n",
    "\n",
    "X_train, X_test, X_valid = X[index_train], X[index_test], X[index_valid]\n",
    "\n",
    "# One label vector per emotion, subset with the same row indices as X.\n",
    "y = {annot: df_annotated[annot].values for annot in annotations}\n",
    "y_train, y_test, y_valid = (\n",
    "    {annot: y[annot][rows] for annot in annotations}\n",
    "    for rows in (index_train, index_test, index_valid)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "9d172afc-10e1-4b84-b334-a94a2cb76f6f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "hate against out-groups  AuROC (test)=  0.8777777777777778\n",
      "hate against out-groups  AuROC (validation)=  0.9019429265330904\n",
      "hate against out-groups  accuracy score (test) =  0.9477806788511749\n",
      "hate against out-groups  accuracy score (validation) =  0.9114583333333334\n",
      " thresh =  0.12  fpr =  0.6521739130434783  tpr =  0.6521739130434783\n",
      "\n",
      "\n",
      "anger  AuROC (test)=  0.731299337916985\n",
      "anger  AuROC (validation)=  0.7405394524959742\n",
      "anger  accuracy score (test) =  0.7362924281984334\n",
      "anger  accuracy score (validation) =  0.7447916666666666\n",
      " thresh =  0.465  fpr =  0.3277310924369748  tpr =  0.3277310924369748\n",
      "\n",
      "\n",
      "fear and insecurity  AuROC (test)=  0.7834472971229519\n",
      "fear and insecurity  AuROC (validation)=  0.7267065868263473\n",
      "fear and insecurity  accuracy score (test) =  0.8772845953002611\n",
      "fear and insecurity  accuracy score (validation) =  0.859375\n",
      " thresh =  0.255  fpr =  0.2826086956521739  tpr =  0.2826086956521739\n",
      "\n",
      "\n",
      "indignation  AuROC (test)=  0.782583284628872\n",
      "indignation  AuROC (validation)=  0.815442561205273\n",
      "indignation  accuracy score (test) =  0.8590078328981723\n",
      "indignation  accuracy score (validation) =  0.8802083333333334\n",
      " thresh =  0.125  fpr =  0.5172413793103449  tpr =  0.5172413793103449\n",
      "\n",
      "\n",
      "joy and pride  AuROC (test)=  0.6988040629095675\n",
      "joy and pride  AuROC (validation)=  0.779309545875811\n",
      "joy and pride  accuracy score (test) =  0.793733681462141\n",
      "joy and pride  accuracy score (validation) =  0.8125\n",
      " thresh =  0.215  fpr =  0.30357142857142855  tpr =  0.30357142857142855\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "RF_sentimiento = {}\n",
    "thresholds_annot = {}\n",
    "AuROCs = {}\n",
    "AuROCs_valid = {}\n",
    "\n",
    "for annot in annotations:\n",
    "    # One random forest per emotion label, fitted on the training part.\n",
    "    rfc = RandomForestClassifier(n_estimators=200)\n",
    "    rfc.fit(X_train, y_train[annot])\n",
    "    RF_sentimiento[annot] = rfc\n",
    "\n",
    "    # Column 1 of predict_proba; presumably the probability of label 1\n",
    "    # (assumes the labels are binary 0/1 - TODO confirm).\n",
    "    y_prob = rfc.predict_proba(X_test)[:, 1]\n",
    "    y_prob_valid = rfc.predict_proba(X_valid)[:, 1]\n",
    "\n",
    "    AuROCs[annot] = sklearn.metrics.roc_auc_score(y_test[annot], y_prob)\n",
    "    AuROCs_valid[annot] = sklearn.metrics.roc_auc_score(y_valid[annot], y_prob_valid)\n",
    "    print(annot, \" AuROC (test)= \", AuROCs[annot])\n",
    "    print(annot, \" AuROC (validation)= \", AuROCs_valid[annot])\n",
    "\n",
    "    # Choose the decision threshold that maximises Youden's J on the test part.\n",
    "    # J = sensitivity + specificity - 1 = tpr + (1 - fpr) - 1 = tpr - fpr.\n",
    "    fpr, tpr, thresholds = sklearn.metrics.roc_curve(y_test[annot], y_prob)\n",
    "    youdens = tpr - fpr\n",
    "\n",
    "    max_index = youdens.argmax()\n",
    "    max_thresh = thresholds[max_index]\n",
    "    thresholds_annot[annot] = max_thresh\n",
    "\n",
    "    # Accuracy of the thresholded probabilities on both held-out parts.\n",
    "    print(annot, \" accuracy score (test) = \", (y_test[annot] == (y_prob>=max_thresh)).mean())\n",
    "    print(annot, \" accuracy score (validation) = \", (y_valid[annot] == (y_prob_valid>=max_thresh)).mean() )\n",
    "\n",
    "    # False- and true-positive rate at the chosen threshold.\n",
    "    print(\" thresh = \", max_thresh, \" fpr = \", fpr[max_index],  \" tpr = \", tpr[max_index])\n",
    "    print(\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "567e095b-998e-4008-9183-a070202cdd79",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
